home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Chip: Internet
/
Chip Internet.iso
/
wwwutil
/
hotjava.ins
/
hotjava.exe
/
hotjava
/
classsrc
/
browser
/
tools
/
JavaSearch
/
Searcher.java
< prev
next >
Wrap
Text File
|
1995-08-11
|
13KB
|
391 lines
/*
* @(#)Searcher.java 1.19 95/03/20 David A. Brown
*
* Copyright (c) 1994 Sun Microsystems, Inc. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* and its documentation for NON-COMMERCIAL purposes and without
* fee is hereby granted provided that this copyright notice
* appears in all copies. Please refer to the file "copyright.html"
* for further important copyright and licensing information.
*
* SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
* THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
* TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
* PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
* ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
* DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*/
package browser.tools.JavaSearch;
import java.util.Vector;
import java.util.Enumeration;
import java.io.*;
/**
* Searcher: Interface for *searching* a JavaSearch database.
*
*/
class Searcher {
/** The database we're searching */
Database db;
/** The window driving us, if any (for messages; can be null). */
SearchWindow searchWindow;
/** The stop list, which we use to reject unimportant words. **/
StopList stopList;
/** How many words we've read out of the index so far
* doing the current search. */
int indexReadCounter;
// Modes used internally by doSearch()
static final int OR_MODE = 0;
static final int AND_MODE = 1;
static final int NOT_MODE = 2;
/** Searcher constructor. Specify a Database object and window. */
public Searcher(Database theDB, SearchWindow sWin) {
//System.out.println("Searcher [Database "+theDB.dbName+"]...");
db = theDB;
searchWindow = sWin;
stopList = new StopList(System.getenv("HOTJAVA_HOME")+File.separator+
"doc"+File.separator+
StopList.defaultFile);
}
/** Searcher constructor. Specify a Database object. */
public Searcher(Database theDB) {
this(theDB, null);
}
/**
* Main "Search Engine" function, specifying a search string.
*
* Specify a search string, consisting of keywords to search for,
* along with optional boolean keywords "and", "or" and "not".
* Words must be separated by spaces!!
* This method just breaks the search string into a Vector
* of search words, and calls doSearch(Vector searchWords)
* to do the actual searching.
*
* See below for details.
*/
public DocList doSearch(String searchString) {
//System.out.println("doSearch: searchString '"+searchString+"'...");
if ((searchString == null) || (searchString.length()==0)) {
return null;
}
Vector wordVector = new Vector();
int wordStart = 0;
// Break apart searchString, adding keywords to wordVector.
while (wordStart<searchString.length()) {
// Skip over any spaces in searchString
while (searchString.charAt(wordStart) == ' ') {
wordStart++;
if (wordStart >= searchString.length()) break;
}
if (wordStart >= searchString.length()) break;
// Ok, wordStart really points to the start of a word.
int wordEnd = searchString.indexOf(' ',wordStart);
if (wordEnd == -1) wordEnd = searchString.length();
String keyword = searchString.substring(wordStart,wordEnd);
wordVector.addElement(keyword);
wordStart = wordEnd+1;
}
return doSearch(wordVector);
}
/**
* Main "Search Engine" function, specifying a Vector of
* search words.
*
* The searchWords parameter is a Vector of words representing
* a "search string" (ie. keywords to search for, mixed in with
* the optional boolean keywords "and", "or" and "not".)
*
* The search words are processed left to right
* (i.e. in the order they appear in the Vector), with any
* boolean keyword affecting how the *subsequent* word's
* hits are merged into the final result. Note that
* "not" means "and not".
*
* Example: the Vector for "foo and bar or baz not mumble" is
* processed as follows:
* - Find all documents containing "foo"
* - INTERSECT this result with all documents containing "bar"
* - Now ADD all documents containing "baz" to the result
* - INTERSECT this result with all documents NOT
* containing "mumble"
*
* Again, remember "not" means "and not". This means
* that the results you get from searching for simply "not foo"
* might not be what you expect!
*
* This method returns a DocList containing all documents
* which match the search string, in Doc ID order,
* or null if no documents match the search string.
*/
public DocList doSearch(Vector searchWords) {
//System.out.println("doSearch: Number of searchWords: "+
// searchWords.size()+"...");
//System.out.println("doSearch: searchWords "+searchWords);
// Start of with an empty set of Doc IDs
IDVector resultVector = new IDVector();
// The query words we've dropped due to presence on stop list
Vector stoppedWords = new Vector(2);
// The "boolean mode" to use when combining
// our next set of Doc IDs with the
// result set we're accumulating.
int booleanMode = OR_MODE; // OR is the default
boolean onlyStopWords = true; // so far we've only seen stop words
Enumeration e = searchWords.elements();
while (e.hasMoreElements()) {
String keyword = (String)e.nextElement();
// Make sure keyword is all lower-case
keyword = IndexingInputStream.downcase(keyword);
//System.out.println("doSearch: processing keyword '"+keyword+"'.");
// If the word is a boolean control keyword,
// update booleanMode and continue
// on to the next word
if (keyword.equals("or")) {
booleanMode = OR_MODE;
continue;
}
else if (keyword.equals("and")) {
booleanMode = AND_MODE;
continue;
}
else if (keyword.equals("not")) {
booleanMode = NOT_MODE;
continue;
}
// Is it on our stop list? If so, skip it.
if (stopList.isStopWord(keyword)) {
stoppedWords.addElement(keyword);
continue;
}
// Ok, keyword is a real word. Get an IDVector for it.
Word w = getWordFromIndex(keyword);
//System.out.println("doSearch: getWordFromIndex returned: "+w);
// Now we have a bunch of Doc IDs in w.idvector
// (or, w may be null!)
// Merge that vector into resultVector, using
// the current booleanMode:
switch (booleanMode) {
case AND_MODE:
if (w == null) {
// ANDing with a null set gives you a null set!
System.out.println("ANDing with a null set: clearing resultVector!");
resultVector.clear();
}
else {
// If we've only seen stopwords so far, don't bother to AND
// the null result with anything.
if (onlyStopWords) {
resultVector = w.idvector;
} else {
resultVector.intersectWith(w.idvector);
}
}
break;
case NOT_MODE:
if (w == null) {
// "AND NOT <null set>" doesn't change
// what you started with.
//System.out.println("NOT with a null set; resultVector unaffected.");
}
else {
resultVector.intersectWithNot(w.idvector);
}
break;
default: // OR mode
if (w == null) {
// "OR <null set>" doesn't change what you started with.
//System.out.println("OR with a null set; resultVector unaffected.");
}
else {
resultVector.unionWith(w.idvector);
}
break;
}
// Reset booleanMode for the next word (we imply OR
// if there's no boolean control keyword)
booleanMode = OR_MODE;
// If we've gotten this far, we're seeing real words.
onlyStopWords = false;
}
// Done with the search! Results are in resultVector.
// Display stopped words, if any.
if (stoppedWords.size() > 0) {
if (searchWindow == null) {
System.out.print("Searcher: Discarded ");
for (int i = 0; i < stoppedWords.size(); i++) {
System.out.print(stoppedWords.elementAt(i)+" ");
}
System.out.print("\n");
} else {
searchWindow.displayStoppedWords(stoppedWords);
}
}
if (resultVector.count == 0) {
//System.out.println("doSearch: resultVector.count is 0!");
return null;
}
//System.out.println("doSearch: resultVector.count is "+
// resultVector.count);
// REMIND: Would be cool to have a MAX_SEARCH_HITS paramater
// here, so we don't create a DocList if we have an absurdly
// large number of hits.
//
// If (resultVector.count > MAX_SEARCH_HITS), we could either
// truncate resultVector to an acceptable length,
// or (better) return a special value indicating
// we had too many hits (so the user should refine his
// search, and try again...)
//
// Actually, it would be cool for this method to
// somehow return a number_of_hits *and* a DocList!
// So if the DocList comes back null, the
// number_of_hits tells you whether your got NO hits,
// or too many hits.
// Generate and return a new DocList based on resultVector --
// this involves opening the .docs and .docindex files,
// and reading a Doc object for each hit.
DocList dl = new DocList(db, resultVector);
return dl;
}
/** Look up the specified word in the index, and return a
* Word object. Return null if the word wasn't found. */
public synchronized Word getWordFromIndex(String aWord) {
//System.out.println("getWordFromIndex: finding '"+aWord+"'...");
indexReadCounter = 0;
// Open the index and qindex files
RandomAccessFile indexFile =
new RandomAccessFile(db.indexFilename,"r");
RandomAccessFile qindexFile =
new RandomAccessFile(db.qindexFilename,"r");
int numWords = qindexFile.length()/4;
//System.out.println("getWordFromIndex: qindexFile has "+
// numWords+" entries.");
// Binary search the qindex file (each qindex
// entry points at a word in the index file)
Word resultWord = bsearchWord(aWord,
0, numWords-1,
qindexFile, indexFile);
//System.out.println("getWordFromIndex: Got resultWord: "+resultWord);
//System.out.println(" Did "+indexReadCounter+" read"+
// ((indexReadCounter==1)?"":"s")+
// " from the index/qindex files.");
return resultWord;
}
//
// These methods are the guts of the getWordFromIndex() functionality
//
/**
* Binary-search index file for the specified word.
* Returns a Word object if the word is found;
* Returns null if the word is not in the index.
* Calls itself recursively.
*/
private Word bsearchWord(String word, int loPos, int hiPos,
RandomAccessFile qindexFile,
RandomAccessFile indexFile) {
//System.out.println(" bsearchWord ("+word+"), range ["+
// loPos+","+hiPos+"]...");
if (loPos > hiPos) {
//System.out.println(" loPos > hiPos. ["+loPos+","+hiPos+"]. Returning null.");
return null;
}
else if (loPos == hiPos) {
if (word.equals(getWordAtPos(loPos, qindexFile, indexFile))) {
return new Word(word,indexFile);
}
else {
return null;
}
}
else {
// loPos and hiPos describe a range. Check the
// *middle* of the range, and recurse.
int midPos = (loPos+hiPos)/2;
String midWord = getWordAtPos(midPos, qindexFile, indexFile);
//System.out.println(" midPos "+midPos+
// ", midWord '"+midWord+"'.");
int compareResult = word.compareTo(midWord);
if (compareResult == 0) {
// Direct hit!
return new Word(word,indexFile);
}
else if (compareResult < 0) {
// word < midWord: Search the lower half
return bsearchWord(word, loPos, midPos-1,
qindexFile, indexFile);
}
else {
// word > midWord: Search the upper half
return bsearchWord(word, midPos+1, hiPos,
qindexFile, indexFile);
}
}
}
/** Return the word found at the Nth position in the index file.
* Leaves indexFile pointing at the word's doc entries. */
private String getWordAtPos(int n, RandomAccessFile qindexFile,
RandomAccessFile indexFile) {
//System.out.print(" getWordAtPos "+n+"... ");
// Read an index file position from the qindex file.
// qindex entries are ints, which are 4 bytes long.
qindexFile.seek(n*4);
int indexPos = qindexFile.readInt();
// Read just the word (a String) from the index file
indexFile.seek(indexPos);
String word = indexFile.readLine();
//System.out.println("found word '"+word+"'.");
indexReadCounter++; // One indexReadCounter increment
// means one int from the qindexFile
// and one String from the indexFile.
return word;
}
}